/*
[] brackets indicate internal census variable name that can't be disclosed. 

Randomly select piks and store the seeds from each run through the loop. 
1) Create an empty base table on which to append the seeds (only do this once before running). 
2) Pull in the seed as a macro variable from the environment variable VAR1, which the bash script sets from its loop counter.
3) Randomly sample using proc surveyselect, print seed output, and append seed to base table
   (the seed print/append code currently lives in the commented-out block at the end of this file). 
4) Merge back to original data for regressions. 
*/

libname hannah '/projects/users/########/Snapshot2022/IntermediateData';
libname hantmp '/projects/users/########/Snapshot2022/IntermediateData/TempData';
libname ans '/projects/users/########/Snapshot2022/AnalysisData';

/*
* initialize a base empty seed table for appending later seeds;
* ONLY RUN THIS ONCE - DON'T OVERWRITE ON EACH RUN THROUGH BASH LOOP;
proc sql;
  create table hantmp.pikloopseeds
    (
    Label1 char(21) format=$21. informat=$21. label='Label1',
    cValue1 char(13) format=$13. informat=$13. label='cValue1',
    nValue1 num format=D12.3 label='nValue'
    );
quit;
*/

/* 
* get counties merged to individual piks (could do hex IDs);
* ONLY NEED TO DO THIS ONCE;
proc sql;
  create table hantmp.somecoll_pikscnt as select
  a.*, b.*
  from hantmp.rand_pikscnt as a 
    inner join hantmp.pikspine as b on a.pik=b.pik
  order by a.pik;
quit;

data hantmp.somecoll_pikscnt;
  set hantmp.somecoll_pikscnt (keep = pik [county id] somecoll);
  if somecoll=0 then delete;
  drop somecoll;
run;

* drop duplicates - get unique piks and their county IDs;
proc sort data = hantmp.somecoll_pikscnt nodupkey;
  by pik;
run;

* data must be sorted by strata;
proc sort data = hantmp.somecoll_pikscnt;
  by [county id];
run;
*/

/* 
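* Already done - ONLY NEEDS TO BE RUN ONCE, PROVIDED THE COUNTY _NSIZE_ TABLE IS STORED;
* (the sampling step further down reads it as hantmp.somecoll_2freq3, whereas this code builds it in WORK);
* get frequencies in each stratum;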
proc freq data = hantmp.somecoll_pikscnt;
  tables [county id] / out=somecoll_2freq;
run;

* determine stratum sizes. want 1000000 obs in total, so need a dataset giving the number to draw per stratum;
*_nsize_ is the sample size for each stratum and must be a positive integer;
* strata whose rounded size is 0 are written to the error dataset and dropped from the main one;
data somecoll_2freq2 somecoll_2error;
  set somecoll_2freq;
  sampnum=(percent * 1000000)/100;
  _nsize_ = round(sampnum,1);
  sampnum = round(sampnum,0.01);
  if _nsize_ = 0 then output somecoll_2error;
  if _nsize_ = 0 then delete;
  if count >= ##### then _nsize_ = #####;
  if count < ##### then _nsize_ = count;
output somecoll_2freq2;
run;
* keep strata variable and sample size in each stratum;
data somecoll_2freq3;
  set somecoll_2freq2;
  keep [county id] _nsize_;
run;
data somecoll_2err3;
  set somecoll_2error;
  keep [county id] _nsize_;
run;
proc append base = somecoll_2freq3 data = somecoll_2err3;
run;
proc sort data = somecoll_2freq3;
  by [county id];
run;
*/

* Pull in seed from bash loop counter (environment variable VAR1) and save as a macro variable;
%let seedi = %sysget(VAR1);

/*
Validate the seed before any sampling happens.
  - VAR1 unset: %sysget returns a null value, which would turn "seed = &seedi" into a
    syntax error further down, so stop here with a clear message instead.
  - VAR1 not made up of digits only: not usable as a reproducible seed, stop.
  - VAR1 = 0: PROC SURVEYSELECT falls back to the system clock for a zero (or negative)
    seed, so the draw could not be reproduced from the loop counter. Warn but continue.
The check is wrapped in a macro because %abort is only allowed inside a macro definition.
*/
%macro check_seedi;
  %if %length(&seedi) = 0 %then %do;
    %put ERROR: Environment variable VAR1 is not set, so there is no seed for PROC SURVEYSELECT.;
    %abort cancel;
  %end;
  %else %if %sysfunc(verify(&seedi,0123456789)) > 0 %then %do;
    %put ERROR: VAR1=&seedi is not a non-negative integer and cannot be used as a seed.;
    %abort cancel;
  %end;
  %else %if &seedi = 0 %then %do;
    %put WARNING: VAR1=0 makes PROC SURVEYSELECT seed from the system clock, so this run is not reproducible.;
  %end;
%mend check_seedi;
%check_seedi

/*
Draw the stratified random sample of piks for this loop iteration.
  - input:   hantmp.somecoll_pikscnt (must already be sorted by the strata variable)
  - strata:  [county id]
  - sizes:   per-stratum sample sizes are read from hantmp.somecoll_2freq3
  - seed:    &seedi, taken from the bash loop counter, so each iteration can be re-run
  - output:  hantmp.somecoll_pikloop_lpdid (overwritten on every iteration)
method = srs is the procedure default and is spelled out only for readability.
*/
proc surveyselect
    data     = hantmp.somecoll_pikscnt
    method   = srs
    sampsize = hantmp.somecoll_2freq3
    seed     = &seedi
    out      = hantmp.somecoll_pikloop_lpdid;
  strata [county id];
run;

/*
Merge the sampled piks back onto the full analysis data so Stata gets one file to analyze.
Only piks present in the sample survive the inner join; rows are ordered by pik, then year.
Both inputs carry pik, so the select list names it twice - expect a duplicate-column
warning in the log when the table is created.
*/
proc sql;
  create table ans.somecoll_pikloop_lpdid as
    select ana.*, smp.*
    from ans.somecoll_analysis as ana
      inner join hantmp.somecoll_pikloop_lpdid as smp
        on ana.pik = smp.pik
    order by ana.pik, ana.year;
quit;


/* 
*For specific groups (e.g., somecoll);
ods output Summary = Summary;
proc surveyselect data = hantmp.somecoll_piks
  out = hantmp.somecoll_pikloop
  method = srs
  sampsize = 1000000
  seed = config outseed;
run;

proc append base = hantmp.pikloopseeds data = summary;
run; quit;
*/

